Clear all variables before starting
# Remove every object that ls() lists in the current environment so the
# script does not pick up variables left over from an earlier run.
# NOTE(review): this only deletes objects; attached packages and options are
# untouched, so running the script in a fresh R session is the more reliable
# way to get a clean start.
rm(list=ls())
Load required libraries
library(text2vec)
library(data.table)
library(magrittr)
library(sentimentr)
library(glmnet)
library(vip)
Here, we load the training and test data into their respective data frames. We also make use of the crowdflower_weather dataset, which becomes available once the sentimentr package is loaded. Its sentiment labels must first be converted to the same scale as our data (-1 -> 1, 0 -> 2, 1 -> 3); the relabelled tweets are then appended to the training data, giving us an expanded training dataset.
# Labelled tweets supplied for training.
# NOTE(review): presumably train.csv holds exactly the columns `tweet` and
# `sentiment`; the rbind() below needs its columns to match those of `cfw`.
twitter_train <- read.csv("train.csv", stringsAsFactors = FALSE)

# crowdflower_weather is available once sentimentr is attached, so the
# explicit data() call is left disabled.
#data(crowdflower_weather)
cfw <- data.frame(crowdflower_weather, stringsAsFactors = FALSE)

# Keep only the tweet text and its label, naming the text column `tweet`
# to line up with the training data.
cfw <- setNames(cfw[, c("text", "sentiment")], c("tweet", "sentiment"))

# Shift the labels from the -1 / 0 / 1 scale onto the 1 / 2 / 3 scale.
cfw$sentiment <- cfw$sentiment + 2

# Unlabelled tweets to be scored; `sentiment` starts as a placeholder of 0
# and is overwritten with the model's predictions later on.
twitter_test <- read.csv("test.csv", stringsAsFactors = FALSE)
twitter_test$sentiment <- 0

# Append the relabelled weather tweets to the training set.
twitter_train <- rbind(twitter_train, cfw)
The tokenization step requires an ID for each training document, so we assign sequential IDs to the rows of the training dataset here.
# Give every training row a sequential integer ID (1, 2, ..., n). These IDs
# are passed to itoken() when the training documents are tokenized.
# Assigning the whole column at once avoids re-copying the data frame on each
# iteration, and seq_len() yields an empty vector for a zero-row frame where
# 1:nrow() would iterate over 1 and 0.
twitter_train$ID <- seq_len(nrow(twitter_train))
Next, we have our data pre-processing and tokenization steps.
# Text normalisation and tokenisation shared by the training and test data:
# lower-case every tweet, then split it into word tokens.
prep_fun <- tolower
tok_fun <- word_tokenizer

# Token iterator over the training tweets, keyed by the training row IDs.
it_train <- itoken(
  twitter_train$tweet,
  preprocessor = prep_fun,
  tokenizer = tok_fun,
  ids = twitter_train$ID,
  progressbar = FALSE
)

# Vocabulary of unigrams and bigrams built from the training tweets only,
# pruned to terms that occur at least 10 times and appear in at most half of
# the documents.
vocab <- it_train %>%
  create_vocabulary(ngram = c(1L, 2L)) %>%
  prune_vocabulary(term_count_min = 10, doc_proportion_max = 0.5)

# Token iterator over the test tweets; the tweets are pre-processed and
# tokenised up front, then wrapped in an iterator.
# NOTE(review): this reads `twitter_test$ID`, while the submission step reads
# `twitter_test$id`. `$` is case-sensitive, so one of the two is presumably
# NULL -- confirm the column name in test.csv.
it_test <- twitter_test$tweet %>%
  prep_fun() %>%
  tok_fun() %>%
  itoken(ids = twitter_test$ID, progressbar = FALSE)

# Maps documents onto the pruned unigram/bigram vocabulary.
bigram_vectorizer <- vocab_vectorizer(vocab)

# define tfidf model
tfidf <- TfIdf$new()

# Document-term matrix for the training data; fit_transform() both fits the
# tf-idf model (`tfidf` is modified by the call) and returns the weighted
# training matrix.
dtm_train <- create_dtm(it_train, bigram_vectorizer)
dtm_train_tfidf <- fit_transform(dtm_train, tfidf)

# Vectorize the test data with the same vocabulary and apply the tf-idf
# weights already fitted on the training data.
dtm_test_tfidf <- create_dtm(it_test, bigram_vectorizer) %>%
  transform(tfidf)
We run k-fold cross-validation with k = 10 to find the optimal lambda, which we use later on when making predictions with the model.
# Fix the RNG seed before cv.glmnet() is called so the run is repeatable.
set.seed(6)

# Number of cross-validation folds.
k <- 10

# Lasso-penalised (alpha = 1) multinomial regression on the tf-idf features,
# cross-validated on misclassification error.
glmnet_classifier <- cv.glmnet(
  x = dtm_train_tfidf,
  y = twitter_train[["sentiment"]],
  family = "multinomial",
  alpha = 1,
  type.measure = "class",
  nfolds = k
)

# Cross-validated mean error plotted against lambda.
plot(glmnet_classifier$lambda, glmnet_classifier$cvm)

# Cross-validated error at the selected lambda.
with(glmnet_classifier, cvm[lambda == lambda.min])
## [1] 0.1043288

# The lambda selected by cross-validation.
glmnet_classifier$lambda.min
## [1] 0.002376492

# Variable importance plot for the fitted model.
vip(glmnet_classifier)
To make predictions on the test dataset, we use the predict function with s set to the optimal lambda found above (0.002376492 in this run) and type set to “response”. We then add the predictions to the test dataset by selecting, for each tweet, the sentiment with the maximum probability. Finally, we prepare the submission file.
# Predict class probabilities for the test documents at the lambda chosen by
# cross-validation. Passing "lambda.min" reads the value from the fitted
# cv.glmnet object instead of a rounded number copied from an earlier run's
# console output, so it stays correct if the data, seed, or folds change.
preds <- predict(
  glmnet_classifier,
  dtm_test_tfidf,
  s = "lambda.min",
  type = "response"
)

# For each test document (first dimension of `preds`), find the position of
# the class with the highest predicted probability. which.max() keeps the
# first maximum on ties.
best_class <- unname(apply(preds, 1L, which.max))

# Translate the position into the class label carried in the second dimnames
# entry of `preds`. With classes 1, 2, 3 this equals the position itself, and
# it stays correct if the labels are not 1..K. Fall back to the position when
# no labels are attached.
class_labels <- dimnames(preds)[[2L]]
if (is.null(class_labels)) {
  class_labels <- as.character(seq_len(dim(preds)[2L]))
}

# Stored as character, then converted to numeric when the submission is built.
twitter_test$sentiment <- class_labels[best_class]
#head(twitter_test)

# Two-column numeric matrix (id, sentiment) in the submission layout.
# NOTE(review): this reads `twitter_test$id`, while the test token iterator is
# built with `twitter_test$ID` -- confirm which column name test.csv uses.
twitter_submission <- cbind(
  as.numeric(twitter_test$id),
  as.numeric(twitter_test$sentiment)
)
colnames(twitter_submission) <- c("id", "sentiment")
#write.csv(twitter_submission,'SubmissionCVGlmnet.csv',row.names = FALSE)